# Packages
library(haven)
library(tidyverse)
library(naniar)
library(scales)
library(lme4)
library(jtools)
library(stargazer)
library(sjPlot)
library(performance)
library(nlme)
library(lattice)
library(sjmisc)
library(sjPlot)
library(glmmTMB)
library(psych)
Data Source: Understanding Society – The UK Household Longitudinal Study, “w_indresp.dta”, where the prefix w stands for the wave letter, wave a to wave l (except wave h). Downloaded from: UK Data Service, SN 6614, Understanding Society: Waves 1-12, 2009-2021 and Harmonised BHPS: Waves 1-18, 1991-2009.
The data are from Understanding Society, the UK Household Longitudinal Study. This dissertation uses the indresp.dta survey data from wave 1 in 2009 to wave 12 in 2021, except for the wave 8 survey, because the key political-interest variable was omitted from the wave 8 data. Therefore, there are 11 waves of data in total to study. The next and most important step is to combine the datasets from the 11 independent files into one longitudinal dataset.
Based on the definition of older people, the project will select the observations aged between 60 and 69 in the first wave and attend all 11 waves(Fletcher, 2021).
The project will collect the data from the Understanding Society “w_indresp.dta” files. Because the data are scattered across 11 separate .dta files, they need to be merged into one longitudinal dataset.
Based on the research question, the project will select 1 dependent variable and 5 independent variables. The dependent variable is political interest. In the survey, political interest is expressed as “_vote6”. In the original survey coding, the value 1 means the respondent is very interested in politics, the value 2 fairly interested, the value 3 not very interested, and the value 4 not at all interested; this scale is reversed later in the analysis so that higher values mean more interest. The first independent variable is age. In the survey, age is expressed as “_dvage”. The second independent variable is retirement. In the survey, retirement is expressed as “a_retdatey” (wave 1), “_jbendreas” (waves 2-6), and “_jbendreas6” (waves 7-11). The value 1 of retirement means the older person is retired; the value 0 means the older person has not retired. The third independent variable is health. In the survey, health status is expressed as “_health”. Health is a binary variable: value 1 means the respondent has a long-standing illness or disability, and value 0 means they do not. The variable for marital status in the survey is “_marstat_dv”. The fourth independent variable, widowhood and divorce, can be derived from “_marstat_dv”. The fifth variable, income, is a continuous variable.
# Accumulator for the merged longitudinal dataset
temp_data <- tibble()
# Wave letters for the 11 analysed waves; "h" (wave 8) is skipped because
# that wave omitted the political-interest item. year_nums renumbers the
# retained waves as consecutive occasions 1-11.
year_letters <- c("a", "b", "c", "d", "e", "f", "g", "i", "j", "k", "l")
year_nums <- seq_along(year_letters)
# Finding the variables from 11 data files.
#
# Every wave file w_indresp.dta prefixes each variable with its wave letter,
# so the wave-specific names are renamed to common names before stacking.
# The retirement indicator comes from a different source variable per wave:
#   wave 1     : a_retdatey   (a non-negative retirement year => retired)
#   waves 2-6  : _jbendreas   (code 6 => job ended through retirement)
#   waves 7-11 : _jbendreas6  (code 1 => retired)
# Pieces are collected in a pre-allocated list and bound once at the end,
# instead of growing temp_data with bind_rows() inside the loop.
wave_pieces <- vector("list", length(year_letters))
for (i in seq_along(year_letters)) {
  year_letter <- year_letters[i]
  year_num <- year_nums[i]
  file_name <- paste0(year_letter, "_indresp.dta")
  if (!file.exists(file_name)) next
  year_data <- read_dta(file_name)
  # Wave-specific source column for the retirement indicator
  if (year_num == 1) {
    retirement_col <- "a_retdatey"
  } else if (year_num %in% 2:6) {
    retirement_col <- paste0(year_letter, "_jbendreas")
  } else {
    retirement_col <- paste0(year_letter, "_jbendreas6")
  }
  wave_pieces[[i]] <- year_data %>%
    rename(PI = paste0(year_letter, "_vote6"),
           age = paste0(year_letter, "_dvage"),
           employment = paste0(year_letter, "_employ"),
           sex = paste0(year_letter, "_sex_dv"),
           health = paste0(year_letter, "_health"),
           edu = paste0(year_letter, "_qfhigh_dv"),
           marital_status = paste0(year_letter, "_marstat_dv"),
           income = paste0(year_letter, "_fimnnet_dv"),
           retirement = all_of(retirement_col)) %>%
    select(pidp, PI, age, employment, sex, health, edu,
           marital_status, income, retirement) %>%
    mutate(wave = year_num,
           # Recode to 1 = retired, NA = not (yet) observed as retired;
           # NAs are filled forward and zeroed in the next step
           retirement = case_when(
             year_num == 1 & retirement >= 0 ~ 1,
             year_num %in% 2:6 & retirement == 6 ~ 1,
             year_num %in% 7:11 & retirement == 1 ~ 1
           ))
}
# Combine data together (single bind instead of repeated appends)
temp_data <- bind_rows(wave_pieces)
# Deal with retirement: once a person is observed as retired, carry the
# flag forward to later waves; anyone never observed retiring is coded 0.
temp_data <- temp_data %>%
  arrange(pidp, wave) %>%
  group_by(pidp) %>%
  fill(retirement, .direction = "down") %>%
  ungroup() %>%
  mutate(retirement = coalesce(retirement, 0))
# Define NA missing value: survey missing-value codes, per variable
missing_codes <- list(
  PI = c(-10, -9, -8, -7, -2, -1),
  edu = c(-9, -8),
  sex = c(-9),
  employment = c(-9, -8, -2, -1),
  health = c(-9, -8, -2, -1),
  marital_status = c(-9, 0),
  income = c(-9, -4)
)
for (variable in names(missing_codes)) {
  temp_data[[variable]][temp_data[[variable]] %in% missing_codes[[variable]]] <- NA
}
# Delete rows with any missing value (listwise deletion)
temp_data <- na.omit(temp_data)
# PI order change
# In the survey coding (Understanding Society, 2023) a LOWER number means
# MORE political interest, which is counter-intuitive. Reverse the 1-4
# scale so a higher number represents higher interest. After reversal:
#   1 = not at all interested, 2 = not very, 3 = fairly, 4 = very
temp_data <- temp_data %>%
  mutate(PI = case_when(PI %in% 1:4 ~ 5 - PI))
# Education
# The line of demarcation is graduation: survey codes 1-16 all denote some
# qualification (graduate = 1); code 96 denotes none of the listed
# qualifications (not graduate = 0) — per the survey codebook, confirm.
temp_data <- temp_data %>%
  mutate(edu = case_when(
    edu %in% 1:16 ~ 1,
    edu == 96 ~ 0
  ))
# Marital_status: survey code 3 = widowed, 4 = divorced; recode to
# 1 = widow, 2 = divorce, 0 = any other marital status
temp_data <- temp_data %>%
  mutate(marital_status = case_when(
    marital_status == 3 ~ 1,
    marital_status == 4 ~ 2,
    marital_status %in% c(1, 2, 5, 6) ~ 0
  ))
# Setting log_income
# Zero and negative incomes exist in the data; clamp them to a tiny
# positive threshold so the logarithm is defined (such rows get
# log(1e-10) ~= -23).
threshold <- 1e-10
# log() is a row-wise transform, so no person-level grouping is needed.
# The original grouped by pidp before arranging, but arrange() ignores
# groups anyway; sorting by wave is kept to preserve the row order the
# later steps see.
temp_data <- temp_data %>%
  mutate(income = ifelse(income <= 0, threshold, income),
         log_income = log(income)) %>%
  arrange(wave)
# Setting time lags t-1 and t-2 for retirement, health and marital status.
# One grouped pipeline replaces six duplicated group_by/arrange/ungroup
# pipelines; lag() operates within each person after sorting by wave.
# Column order matches the original (all t-1 lags, then all t-2 lags),
# which later `select(pidp:lag_marital_status_2)` relies on.
temp_data <- temp_data %>%
  group_by(pidp) %>%
  arrange(wave) %>%
  mutate(lag_retirement_1 = lag(retirement),
         lag_health_1 = lag(health),
         lag_marital_status_1 = lag(marital_status),
         lag_retirement_2 = lag(retirement, 2),
         lag_health_2 = lag(health, 2),
         lag_marital_status_2 = lag(marital_status, 2)) %>%
  ungroup()
# Categorical variables: convert every discrete predictor (and its lags)
# to a factor so the models treat them as categorical
factor_columns <- c("sex", "edu", "employment", "marital_status",
                    "retirement", "health",
                    "lag_marital_status_1", "lag_marital_status_2",
                    "lag_retirement_1", "lag_retirement_2",
                    "lag_health_1", "lag_health_2")
temp_data[factor_columns] <- lapply(temp_data[factor_columns], as.factor)
# Select the ageing cohort: keep only people observed in all 11 waves
# whose age at their first observed wave is 60-69 (Fletcher, 2021).
# The arrange(pidp, wave) guarantees first(age) is the wave-1 age.
Longitudinal_data <- temp_data %>%
  arrange(pidp, wave) %>%
  group_by(pidp) %>%
  filter(n() == 11 & first(age) >= 60 & first(age) <= 69) %>%
  ungroup()
# Recode the value variable.
# Builds the final analysis dataset `data`: strips haven value labels from
# the continuous variables, relabels every factor with readable levels,
# and renames retirement -> retired (dropping the raw retirement columns).
# NOTE: fct_relevel makes 'no' the reference category for employment and
# health; marital_status keeps 'none' as reference by level order.
data <- Longitudinal_data %>%
ungroup() %>%
select(pidp:lag_marital_status_2) %>%  # keeps all modelling columns incl. log_income and the lags
mutate(health = zap_label(health),  # drop haven labels before re-factoring
health = as.factor(health),
age = zap_label(age),
age = as.numeric(age),
income = zap_label(income),
income = as.numeric(income),
employment = fct_recode(employment,
'yes' = '1',
'no' = '2'),
employment = fct_relevel(employment,'no','yes'),  # 'no' = reference level
sex = fct_recode(sex,
'male' = '1',
'female' = '2'),
sex = fct_drop(sex),  # drop unused levels left over from missing codes
health = fct_recode(health,
'yes' = '1',
'no' = '2'),
health = fct_relevel(health,'no','yes'),  # 'no' = reference level
edu = fct_recode(edu,
'not graduate' = '0',
'graduate' = '1'),
marital_status = fct_recode(marital_status,
'none' = '0',
'widow' = '1',
'divorce' = '2'),
retired = fct_recode(retirement,
'no' = '0',
'yes' = '1'),
lag_retired_1 = fct_recode(lag_retirement_1,
'no' = '0',
'yes' = '1'),
lag_retired_2 = fct_recode(lag_retirement_2,
'no' = '0',
'yes' = '1'),
lag_health_1 = fct_recode(lag_health_1,
'yes' = '1',
'no' = '2'),
lag_health_2 = fct_recode(lag_health_2,
'yes' = '1',
'no' = '2'),
lag_marital_status_1 = fct_recode(lag_marital_status_1,
'none' = '0',
'widow' = '1',
'divorce' = '2'),
lag_marital_status_2 = fct_recode(lag_marital_status_2,
'none' = '0',
'widow' = '1',
'divorce' = '2')) %>%
select(-retirement, -lag_retirement_1, -lag_retirement_2)  # replaced by retired / lag_retired_*
# Visualise the missingness pattern of the final analysis dataset (naniar)
vis_miss(data)
### 3.1 Plotting age and political interest
# Frequency of each level of political interest, pooled over all waves.
# Legend label typo fixed: "Or not at all interested" -> "Not at all interested".
data %>%
  mutate(PI = as.factor(PI)) %>%
  count(PI) %>%
  ggplot(aes(x = PI, y = n, fill = PI)) +
  geom_col() +
  scale_fill_brewer(palette = 'RdBu',
                    labels = c("Not at all interested", "Not very interested",
                               "Fairly interested", "Very interested")) +
  labs(x = 'Political Interest',
       y = 'Frequency',
       title = 'The Frequency of Political Interest') +
  theme_bw()
# Composition of political interest within each wave (100% stacked bars).
# Fixes: "Pencent" -> "Percent"; the original applied theme_bw() and then
# theme_classic() — a complete theme replaces the previous one entirely,
# so the redundant theme_bw() is dropped.
data %>%
  mutate(PI = as.factor(PI)) %>%
  count(PI, wave) %>%
  ggplot(aes(x = wave, y = n, fill = PI)) +
  geom_col(position = 'fill') +
  scale_x_continuous(breaks = 1:11) +
  scale_y_continuous(labels = percent) +
  scale_fill_brewer(palette = 'RdBu',
                    labels = c("Not at all interested", "Not very interested",
                               "Fairly interested", "Very interested")) +
  labs(x = 'Waves',
       y = 'Percent (%)',
       title = 'The Percentage of Each Degree of Political Interest in 11 Waves') +
  theme_classic() +
  theme(plot.title = element_text(size = 12))
# Distribution of age within each level of political interest (violin +
# jittered points). Legend label typo fixed as in the frequency plot.
data %>%
  mutate(PI = as.factor(PI)) %>%
  ggplot(aes(x = PI, y = age)) +
  geom_violin(mapping = aes(fill = PI), alpha = 0.5) +
  geom_jitter(mapping = aes(color = PI), alpha = 0.3) +
  scale_fill_brewer(palette = 'RdBu',
                    labels = c("Not at all interested", "Not very interested",
                               "Fairly interested", "Very interested")) +
  scale_colour_brewer(palette = 'RdBu',
                      labels = c("Not at all interested", "Not very interested",
                                 "Fairly interested", "Very interested")) +
  labs(x = 'Political Interest',
       y = 'Ages',
       title = 'The Degree of Political Interest at Different Ages') +
  theme_bw()
# sjPlot cross-check of the PI-by-age violin plot above
plot_grpfrq(data$PI, data$age, type = "violin")
The figures shows that older people of different ages are concentrated
in different levels of political interest.
# Share of retirement statuses within each PI level. "Pencent" typo fixed.
data %>%
  count(PI, retired) %>%
  ggplot(aes(x = PI, y = n, fill = retired)) +
  geom_col(position = 'fill') +
  scale_y_continuous(labels = percent) +
  labs(x = 'Political Interest',
       y = 'Percent (%)',
       fill = 'Retirement Status',
       title = 'The Percent of Political Interest at Different Retirement Status') +
  theme_bw() +
  theme(plot.title = element_text(size = 12))
# Distribution of income within each PI level; square-root y scale tames
# the long right tail. Legend label typo fixed.
data %>%
  mutate(PI = as.factor(PI)) %>%
  ggplot(aes(x = PI, y = income)) +
  geom_violin(mapping = aes(fill = PI), alpha = 0.5) +
  geom_jitter(mapping = aes(color = PI), alpha = 0.1) +
  scale_fill_brewer(palette = 'RdBu',
                    labels = c("Not at all interested", "Not very interested",
                               "Fairly interested", "Very interested")) +
  scale_colour_brewer(palette = 'RdBu',
                      labels = c("Not at all interested", "Not very interested",
                                 "Fairly interested", "Very interested")) +
  scale_y_sqrt() +
  labs(x = 'Political Interest',
       y = 'Income',
       title = 'The Degree of Political Interest at Different Incomes') +
  theme_bw()
# Share of health statuses within each PI level.
# Fixes: "Pencent" -> "Percent"; "Different Healths" -> grammatical title.
data %>%
  count(PI, health) %>%
  ggplot(aes(x = PI, y = n, fill = health)) +
  geom_col(position = 'fill') +
  scale_y_continuous(labels = percent) +
  labs(x = 'Political Interest',
       y = 'Percent (%)',
       fill = 'Health',
       title = 'The Percent of Political Interest at Different Health Statuses') +
  theme_bw() +
  theme(plot.title = element_text(size = 12))
# Share of marital statuses within each PI level.
# Display strings tidied: 'marital_status' -> 'Marital Status' in the
# legend title and plot title.
data %>%
  count(PI, marital_status) %>%
  ggplot(aes(x = PI, y = n, fill = marital_status)) +
  geom_col(position = 'fill') +
  scale_y_continuous(labels = percent) +
  labs(x = 'Political Interest',
       y = 'Percent (%)',
       fill = 'Marital Status',
       title = 'The Percent of Political Interest at Different Marital Statuses') +
  theme_bw() +
  theme(plot.title = element_text(size = 12))
### 3.6 Descriptive Statistics
# Base-R column-wise summary of the final analysis dataset
Descriptive <- summary(data)
print(Descriptive)
pidp PI age employment sex
Min. :1.021e+09 Min. :1.000 Min. :60.0 no :5184 male :2915
1st Qu.:1.360e+09 1st Qu.:2.000 1st Qu.:66.0 yes:1031 female:3300
Median :1.429e+09 Median :3.000 Median :69.0
Mean :1.436e+09 Mean :2.682 Mean :69.1
3rd Qu.:1.564e+09 3rd Qu.:3.000 3rd Qu.:72.0
Max. :1.634e+09 Max. :4.000 Max. :81.0
health edu marital_status income wave
no :3169 not graduate:1952 none :4703 Min. : 0 Min. : 1
yes:3046 graduate :4263 widow : 779 1st Qu.: 866 1st Qu.: 3
divorce: 733 Median : 1315 Median : 6
Mean : 1640 Mean : 6
3rd Qu.: 1996 3rd Qu.: 9
Max. :33184 Max. :11
log_income lag_health_1 lag_marital_status_1 lag_health_2
Min. :-23.026 yes :2763 none :4295 yes :2508
1st Qu.: 6.764 no :2887 widow : 686 no :2577
Median : 7.182 NA's: 565 divorce: 669 NA's:1130
Mean : 6.966 NA's : 565
3rd Qu.: 7.599
Max. : 10.410
lag_marital_status_2 retired lag_retired_1 lag_retired_2
none :3882 no :2598 no :2362 no :2126
widow : 600 yes:3617 yes :3288 yes :2959
divorce: 603 NA's: 565 NA's:1130
NA's :1130
# m0: null (variance-components) model — random intercept for person only.
# Splits PI variance into between-person and within-person components.
m0 <- lmer(PI ~ (1 | pidp), data = data)
summary(m0, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ (1 | pidp)
Data: data
REML criterion at convergence: 11101.8
Scaled residuals:
Min 1Q Median 3Q Max
-4.9091 -0.4770 0.0257 0.5371 4.8796
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5508 0.7421
Residual 0.2614 0.5112
Number of obs: 6215, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 2.68238 0.03189 84.12
The aim of the study is to examine how the dependent variable PI varies across different pidp (that is, individual ids). The random-intercept model m0 only considers different pidp (individual ids) without any fixed effects. The model estimates the variance of the average PI value across pidp, as well as the overall average of the PI values. The hierarchy of the longitudinal (panel) data is: Level 1: occasions, Level 2: pidp (Bell, Fairbrother & Jones, 2018).
REML criterion at convergence:11101.8 Random effects: -pidp (Intercept): the Variance of the random intercept for pidp is 0.5508, Std. Dev. is 0.7421. -Residual:the Variance of the model residual is 0.2614, Std. Dev. is 0.5112. The variance of the residuals (0.2614) is smaller than the variance of the pidp (0.5508). Therefore, The variation between older people is larger than the variation between occasions within older people.
Fixed effects: For constant term, estimate value 2.68238, Std. Error 0.03189, t value 84.12. The result of fixed effect means that the overall mean of the PI is 2.68238.
A first step in modelling the between occasion within older people, or level 1, variation would be to fit a fixed linear trend. Age as fixed explanatory variables.
# m1: add age as a fixed linear occasion-level (level 1) trend,
# random intercept for person retained
m1 <- lmer(PI ~ age + (1 | pidp), data = data)
summary(m1, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + (1 | pidp)
Data: data
REML criterion at convergence: 11067.3
Scaled residuals:
Min 1Q Median 3Q Max
-4.8734 -0.4631 0.0242 0.5295 4.8812
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5476 0.7400
Residual 0.2595 0.5094
Number of obs: 6215, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.847213 0.127852 14.448
age 0.012086 0.001792 6.744
REML criterion at convergence: 11067.3. The REML value is smaller than in the previous model m0, which had no fixed explanatory variables. The result means that the model with fixed explanatory variables fits better.
Random effects: -pidp (Intercept): The variance of the pidp (Intercept) decreased slightly, from 0.5508 to 0.5476. -Residual: The variance of the residuals also decreased slightly, from 0.2614 to 0.2595. Because both decreases are small, this suggests that age did not explain a great deal of the variation within pidp.
Fixed effects: The intercept decreases from 2.69534 to 1.847213. The result means that the predicted value of the PI is 1.847213 when age is 0. The estimate of age is 0.012086. The result means that when the unit of age increase, the predicted value of the PI increases by 0.012086 units on average. Moreover, the t-value is 6.744, which is usually considered statistically significant, meaning that age is a significant predictor variable.
Likelihood Ratio Test (LRT) is equal to 34.51423, means the m1 fit is better
# Likelihood-ratio statistic for m0 (reduced) vs m1 (full).
# NOTE(review): both models were fitted by REML; for comparing fixed
# effects an ML refit (REML = FALSE) would be preferable — confirm.
-2*(logLik(m0)-logLik(m1))
'log Lik.' 34.51423 (df=3)
The section 5.2 will make the coefficient of age random at level 2. The model includes random intercept and random slope.
# m2: random intercept AND random slope for age at the person level
# (the summary below reports a convergence failure and intercept/slope
# correlation of -0.96, discussed in the text)
m2 <- lmer(PI ~ age + (1 + age | pidp), data = data)
summary(m2, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + (1 + age | pidp)
Data: data
REML criterion at convergence: 10919.4
Scaled residuals:
Min 1Q Median 3Q Max
-4.9274 -0.4545 0.0394 0.4875 5.0774
Random effects:
Groups Name Variance Std.Dev. Corr
pidp (Intercept) 7.05370 2.65588
age 0.00128 0.03578 -0.96
Residual 0.23926 0.48914
Number of obs: 6215, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.843579 0.163468 11.28
age 0.012221 0.002288 5.34
optimizer (nloptwrap) convergence code: 0 (OK)
Model failed to converge with max|grad| = 1.31049 (tol = 0.002, component 1)
Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
In the m2, the random slope is age. The definition of random slope is allowing the associations between variables to vary across higher-level entities (Bell & Fairbrothers &Jones, 2018).
Random Effect: - The variance of pidp (Intercept) is 7.05370 with a standard deviation of 2.65588, which means that there is significant variation in the baseline PI value (the value when age is 0) between different pidp groups. - The variance of age is 0.00128 with a standard deviation of 0.03578. this means that there is also significant variation in the slope of the effect of age on PI between different pidp groups. - The value of Corr is -0.96,and the value close to -1. Thus, the age as random slope is insignificant. Because everyone gets same coefficient for age and gets same random effect. Moreover, model failed to converge, and the model needs to be simplified. The study of the project focus on the group of older people. As the samples all about older people, thus the coefficient may not much different. Therefore, the following model will drop age as random slope and only include pidp as random intercept. The conclusion is different from the Bell & Fairbrothers &Jones (2018) that within-between RE model needs to include random slopes as much as possible.
Fixed Effect: - Intercept 1.843579 is the estimated average value of the PI when age is zero. Std. Error: The value of Std. Error is 0.163468. T-value is 11.28 which is statistically significant. -age The variable age is a significant predictor of the dependent variable. The coefficient for age is 0.012221. For every one-unit increase in age, the predicted value of PI increases by 0.012221 units, holding other factors constant. The Std. Error is 0.002288. T-value is 5.34. The result shows that the relationship between age and DV is statistically significant.
Likelihood Ratio Test (LRT) is equal to 147.8674.
# Likelihood-ratio statistic for m1 (random intercept) vs m2 (adds random
# slope). NOTE(review): REML fits — an ML refit would be preferable here too.
-2*(logLik(m1)-logLik(m2))
'log Lik.' 147.8674 (df=4)
To generate the predicted slopes for each older people type
# Select a subset of individuals: the 30th-50th unique pidp (21 people),
# not the "first 50" as previously commented
selected_pidp = unique(data$pidp)[30:50]
# Filter the data to those individuals
selected_data = data[data$pidp %in% selected_pidp,]
# Generate fitted PI scores from m2 for the selected individuals
selected_predscore = fitted(m2)[data$pidp %in% selected_pidp]
# Plot each person's fitted trajectory against age (lattice)
xyplot(selected_predscore ~ age, data = selected_data, groups = pidp, type = c("p", "l"), col = "blue")
Since there are too many pidp (individuals), some of the pidp are
selected and the predictor scores of the predicted political interest
score are plotted. According to the picture, most of the pidp will have
a predictor value that rises with age.
Check for non-linear relationships
# m3: check for a non-linear age effect by adding a quadratic term,
# random intercept for person only
m3 <- lmer(PI ~ age + I(age^2) + (1 | pidp), data = data)
summary(m3)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + I(age^2) + (1 | pidp)
Data: data
REML criterion at convergence: 11066.1
Scaled residuals:
Min 1Q Median 3Q Max
-4.8999 -0.4739 0.0197 0.5239 4.8617
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5475 0.7399
Residual 0.2588 0.5087
Number of obs: 6215, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) -3.7566362 1.4259996 -2.634
age 0.1744738 0.0411951 4.235
I(age^2) -0.0011716 0.0002969 -3.946
Correlation of Fixed Effects:
(Intr) age
age -0.999
I(age^2) 0.996 -0.999
# Select a subset of individuals: the 30th-50th unique pidp (21 people),
# not the "first 50" as previously commented
selected_pidp = unique(data$pidp)[30:50]
# Filter the data to those individuals
selected_data = data[data$pidp %in% selected_pidp,]
# Generate fitted PI scores from the quadratic model m3
selected_predscore_2 = fitted(m3)[data$pidp %in% selected_pidp]
# Plot each person's fitted (curved) trajectory against age (lattice)
xyplot(selected_predscore_2 ~ age, data = selected_data, groups = pidp, type = c("p", "l"), col = "blue")
The picture shows that as age increases, the predicted political interest
score first increases and then decreases. The line of predicted scores is
non-linear.
The likelihood statistic value is negative -146.7237. The result shows that the quadratic term would not improve the model.
# NOTE(review): m2 (random slope) and m3 (quadratic fixed effect, random
# intercept only) are not nested models, so this LRT — and its negative
# value — should be interpreted with caution; AIC/BIC would be safer.
-2*(logLik(m2)-logLik(m3))
'log Lik.' -146.7237 (df=6)
# Time-lag for income: one- and two-wave lags of log income within person.
# A single grouped pipeline replaces the two duplicated
# group_by/arrange/ungroup pipelines of the original.
data <- data %>%
  group_by(pidp) %>%
  arrange(wave) %>%
  mutate(lag_income_1 = lag(log_income),
         lag_income_2 = lag(log_income, 2)) %>%
  ungroup()
# gm1: contemporaneous predictors with a person random intercept.
# NOTE: the original formula listed `health` twice; lmer drops duplicated
# terms when building the model matrix, so removing the duplicate is a
# pure cleanup with identical estimates.
gm1 <- lmer(PI ~ age + retired + health + marital_status + log_income + (1 | pidp), data = data)
summary(gm1, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + retired + health + marital_status + health + log_income +
(1 | pidp)
Data: data
REML criterion at convergence: 11086.4
Scaled residuals:
Min 1Q Median 3Q Max
-4.8791 -0.4674 0.0225 0.5293 4.8221
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5401 0.7349
Residual 0.2596 0.5095
Number of obs: 6215, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.811367 0.133620 13.556
age 0.012884 0.001821 7.075
retiredyes -0.010130 0.063366 -0.160
healthyes -0.004139 0.017491 -0.237
marital_statuswidow -0.127886 0.043191 -2.961
marital_statusdivorce -0.050341 0.053953 -0.933
log_income 0.001519 0.002847 0.534
# Tabulate gm1 with confidence intervals, pseudo-R2 and ICC (jtools)
summ(gm1,confint = T)
MODEL INFO:
Observations: 6215
Dependent Variable: PI
Type: Mixed effects linear regression
MODEL FIT:
AIC = 11104.36, BIC = 11164.97
Pseudo-R² (fixed effects) = 0.01
Pseudo-R² (total) = 0.68
FIXED EFFECTS:
------------------------------------------------------------
Est. 2.5% 97.5% t val.
--------------------------- ------- ------- ------- --------
(Intercept) 1.81 1.55 2.07 13.56
age 0.01 0.01 0.02 7.07
retiredyes -0.01 -0.13 0.11 -0.16
healthyes -0.00 -0.04 0.03 -0.24
marital_statuswidow -0.13 -0.21 -0.04 -2.96
marital_statusdivorce -0.05 -0.16 0.06 -0.93
log_income 0.00 -0.00 0.01 0.53
------------------------------------------------------------
RANDOM EFFECTS:
------------------------------------
Group Parameter Std. Dev.
---------- ------------- -----------
pidp (Intercept) 0.73
Residual 0.51
------------------------------------
Grouping variables:
-------------------------
Group # groups ICC
------- ---------- ------
pidp 565 0.68
-------------------------
# gm2: all time-varying predictors lagged one wave (t-1).
# na.exclude keeps fitted()/residuals() padded to the full data length
# despite the rows lost to the first-wave lags.
gm2 <- lmer(PI ~ age + lag_retired_1 + lag_health_1 + lag_marital_status_1 + lag_income_1 + (1 |pidp), data = data, na.action=na.exclude)
summary(gm2, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + lag_retired_1 + lag_health_1 + lag_marital_status_1 +
lag_income_1 + (1 | pidp)
Data: data
REML criterion at convergence: 10242.3
Scaled residuals:
Min 1Q Median 3Q Max
-4.8643 -0.4878 0.0311 0.5199 4.7698
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5365 0.7325
Residual 0.2623 0.5122
Number of obs: 5650, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.6304375 0.1501345 10.860
age 0.0156351 0.0020725 7.544
lag_retired_1yes -0.0075661 0.0633806 -0.119
lag_health_1no 0.0152044 0.0186063 0.817
lag_marital_status_1widow -0.1562918 0.0467054 -3.346
lag_marital_status_1divorce -0.1232051 0.0560884 -2.197
lag_income_1 -0.0007314 0.0031544 -0.232
# Tabulate gm2 with confidence intervals (jtools)
summ(gm2,confint = T)
MODEL INFO:
Observations: 6215
Dependent Variable: PI
Type: Mixed effects linear regression
MODEL FIT:
AIC = 10260.30, BIC = 10320.05
Pseudo-R² (fixed effects) = 0.01
Pseudo-R² (total) = 0.67
FIXED EFFECTS:
------------------------------------------------------------------
Est. 2.5% 97.5% t val.
--------------------------------- ------- ------- ------- --------
(Intercept) 1.63 1.34 1.92 10.86
age 0.02 0.01 0.02 7.54
lag_retired_1yes -0.01 -0.13 0.12 -0.12
lag_health_1no 0.02 -0.02 0.05 0.82
lag_marital_status_1widow -0.16 -0.25 -0.06 -3.35
lag_marital_status_1divorce -0.12 -0.23 -0.01 -2.20
lag_income_1 -0.00 -0.01 0.01 -0.23
------------------------------------------------------------------
RANDOM EFFECTS:
------------------------------------
Group Parameter Std. Dev.
---------- ------------- -----------
pidp (Intercept) 0.73
Residual 0.51
------------------------------------
Grouping variables:
-------------------------
Group # groups ICC
------- ---------- ------
pidp 565 0.67
-------------------------
# gm3: all time-varying predictors lagged two waves (t-2) — the main model
# interpreted in the text below
gm3 <- lmer(PI ~ age + lag_retired_2 + lag_health_2 + lag_marital_status_2 + lag_income_2 + (1 |pidp), data = data, na.action=na.exclude)
summary(gm3, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + lag_retired_2 + lag_health_2 + lag_marital_status_2 +
lag_income_2 + (1 | pidp)
Data: data
REML criterion at convergence: 9231.6
Scaled residuals:
Min 1Q Median 3Q Max
-4.3741 -0.4713 0.0432 0.5158 4.6953
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5343 0.7310
Residual 0.2564 0.5064
Number of obs: 5085, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.712915 0.169259 10.120
age 0.014063 0.002353 5.978
lag_retired_2yes -0.024600 0.063427 -0.388
lag_health_2no 0.027125 0.019628 1.382
lag_marital_status_2widow -0.134932 0.050824 -2.655
lag_marital_status_2divorce -0.244441 0.058991 -4.144
lag_income_2 0.005629 0.003419 1.646
# Tabulate gm3 with confidence intervals (jtools)
summ(gm3,confint = T)
MODEL INFO:
Observations: 6215
Dependent Variable: PI
Type: Mixed effects linear regression
MODEL FIT:
AIC = 9249.58, BIC = 9308.38
Pseudo-R² (fixed effects) = 0.01
Pseudo-R² (total) = 0.68
FIXED EFFECTS:
------------------------------------------------------------------
Est. 2.5% 97.5% t val.
--------------------------------- ------- ------- ------- --------
(Intercept) 1.71 1.38 2.04 10.12
age 0.01 0.01 0.02 5.98
lag_retired_2yes -0.02 -0.15 0.10 -0.39
lag_health_2no 0.03 -0.01 0.07 1.38
lag_marital_status_2widow -0.13 -0.23 -0.04 -2.65
lag_marital_status_2divorce -0.24 -0.36 -0.13 -4.14
lag_income_2 0.01 -0.00 0.01 1.65
------------------------------------------------------------------
RANDOM EFFECTS:
------------------------------------
Group Parameter Std. Dev.
---------- ------------- -----------
pidp (Intercept) 0.73
Residual 0.51
------------------------------------
Grouping variables:
-------------------------
Group # groups ICC
------- ---------- ------
pidp 565 0.68
-------------------------
The value of REML is 9231.6. For the random effects, the variance of pidp intercept is 0.5343, the standard deviation is 0.7310. The variance of residual is 0.2565, the standard deviation is 0.5064. The Intra-Class Correlation ICC is 0.68. In other words, 68% of the total variability in political interest is attributed to differences between individual groups. The remaining 32% of the variability comes from differences within the same pidp group across different occasions.
The results of this thesis support the inference of hypothesis1. The estimate value of age is 0.014063, standard deviation is 0.002353. Age is statistically significant with political interest because t-value is 5.978. Holding other variable constant, an increase in age by 1 unit is associated with a change in political by 0.014063 units on average with a 95% confidence interval of (0.01, 0.02). The predicted value of political interest as figure11 showed. The relationship between age and political interest is positive. Moreover, the estimate value of intercept is 1.712915, and standard deviation is 0.169259. Intercept is statistically significant because t-value is 10.120. When all variables are 0, the average value of political interest is 1.712915 with a 95% confidence interval of (1.38, 2.04).
The estimate value of retirement is -0.024600, the standard deviation is 0.063427. Compared to those who were not retired two periods ago, those who retired two periods ago are expected to have a 0.026 units lower political interest with confidence interval (-0.15, 0.10). However, t-value is -0.388, which means retirement may not statistically significant.
The estimate value of income is 0.005629, standard deviation is 0.003419. The t-value of income is 1.646, which is statistically significant. An increase in income by 1% from two periods ago is associated with a change in political interest by 0.00005629 units on average at 90% confidence interval (-0.00, 0.01).
The estimate value of health is 0.027125, and the standard error is 0.019628. Compared to those who reported a long-standing illness or disability two periods ago, those who did not are expected to have a 0.027125 units higher political interest, with a confidence interval of (-0.01, 0.07). However, the t-value is 1.382, which is not statistically significant at the 95% level.
The estimate value of widow is -0.134932, and the standard error is 0.050824. The t-value of widow is -2.655, which is statistically significant. Compared to other marital statuses, those who were widowed two periods ago are expected to have a 0.134932 units lower political interest, with a 95% confidence interval of (-0.23, -0.04). The estimate value of divorce is -0.244441, the standard error is 0.058991, and the t-value is -4.144. Divorce is statistically significant. Compared to other marital statuses, those who were divorced two periods ago are expected to have a 0.244 units lower political interest, with a 95% confidence interval of (-0.36, -0.13).
# Print a plain-text regression table for the preferred model gm3
stargazer(gm3, type = "text")
=======================================================
Dependent variable:
---------------------------
PI
-------------------------------------------------------
age 0.014***
(0.002)
lag_retired_2yes -0.025
(0.063)
lag_health_2no 0.027
(0.020)
lag_marital_status_2widow -0.135***
(0.051)
lag_marital_status_2divorce -0.244***
(0.059)
lag_income_2 0.006*
(0.003)
Constant 1.713***
(0.169)
-------------------------------------------------------
Observations 6,215
Log Likelihood -4,615.788
Akaike Inf. Crit. 9,249.576
Bayesian Inf. Crit. 9,308.382
=======================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Model gm4: random-intercept model of political interest (PI).
# Same twice-lagged predictors as gm3 but with contemporaneous log income;
# na.exclude keeps residuals/fitted values aligned with the original rows.
gm4 <- lmer(
  PI ~ age + lag_retired_2 + lag_health_2 + lag_marital_status_2 +
    log_income + (1 | pidp),
  data = data,
  na.action = na.exclude
)
# Suppress the fixed-effects correlation matrix in the printout
summary(gm4, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + lag_retired_2 + lag_health_2 + lag_marital_status_2 +
log_income + (1 | pidp)
Data: data
REML criterion at convergence: 9231.8
Scaled residuals:
Min 1Q Median 3Q Max
-4.6730 -0.4711 0.0425 0.5152 4.6948
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5343 0.7309
Residual 0.2565 0.5064
Number of obs: 5085, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.686239 0.171462 9.834
age 0.014449 0.002346 6.160
lag_retired_2yes -0.026014 0.063412 -0.410
lag_health_2no 0.027261 0.019631 1.389
lag_marital_status_2widow -0.135291 0.050831 -2.662
lag_marital_status_2divorce -0.244269 0.058992 -4.141
log_income 0.005621 0.003661 1.536
# Tabulate gm4 fixed effects with 95% confidence intervals (jtools::summ).
# Use TRUE rather than T: T is an ordinary variable and can be reassigned.
summ(gm4, confint = TRUE)
MODEL INFO:
Observations: 6215
Dependent Variable: PI
Type: Mixed effects linear regression
MODEL FIT:
AIC = 9249.79, BIC = 9308.60
Pseudo-R² (fixed effects) = 0.01
Pseudo-R² (total) = 0.68
FIXED EFFECTS:
------------------------------------------------------------------
Est. 2.5% 97.5% t val.
--------------------------------- ------- ------- ------- --------
(Intercept) 1.69 1.35 2.02 9.83
age 0.01 0.01 0.02 6.16
lag_retired_2yes -0.03 -0.15 0.10 -0.41
lag_health_2no 0.03 -0.01 0.07 1.39
lag_marital_status_2widow -0.14 -0.23 -0.04 -2.66
lag_marital_status_2divorce -0.24 -0.36 -0.13 -4.14
log_income 0.01 -0.00 0.01 1.54
------------------------------------------------------------------
RANDOM EFFECTS:
------------------------------------
Group Parameter Std. Dev.
---------- ------------- -----------
pidp (Intercept) 0.73
Residual 0.51
------------------------------------
Grouping variables:
-------------------------
Group # groups ICC
------- ---------- ------
pidp 565 0.68
-------------------------
# Model gm5: random-intercept model of political interest (PI) using the
# once-lagged income variable (lag_income_1) in place of log_income.
gm5 <- lmer(
  PI ~ age + lag_retired_2 + lag_health_2 + lag_marital_status_2 +
    lag_income_1 + (1 | pidp),
  data = data,
  na.action = na.exclude
)
# Suppress the fixed-effects correlation matrix in the printout
summary(gm5, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + lag_retired_2 + lag_health_2 + lag_marital_status_2 +
lag_income_1 + (1 | pidp)
Data: data
REML criterion at convergence: 9232.3
Scaled residuals:
Min 1Q Median 3Q Max
-4.7055 -0.4681 0.0434 0.5171 4.6942
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5346 0.7312
Residual 0.2565 0.5064
Number of obs: 5085, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.699077 0.170663 9.956
age 0.014314 0.002346 6.102
lag_retired_2yes -0.025706 0.063436 -0.405
lag_health_2no 0.026832 0.019628 1.367
lag_marital_status_2widow -0.135055 0.050835 -2.657
lag_marital_status_2divorce -0.243997 0.058998 -4.136
lag_income_1 0.005143 0.003859 1.333
# Tabulate gm5 fixed effects with 95% confidence intervals (jtools::summ).
# Use TRUE rather than T: T is an ordinary variable and can be reassigned.
summ(gm5, confint = TRUE)
MODEL INFO:
Observations: 6215
Dependent Variable: PI
Type: Mixed effects linear regression
MODEL FIT:
AIC = 9250.27, BIC = 9309.07
Pseudo-R² (fixed effects) = 0.01
Pseudo-R² (total) = 0.68
FIXED EFFECTS:
------------------------------------------------------------------
Est. 2.5% 97.5% t val.
--------------------------------- ------- ------- ------- --------
(Intercept) 1.70 1.36 2.03 9.96
age 0.01 0.01 0.02 6.10
lag_retired_2yes -0.03 -0.15 0.10 -0.41
lag_health_2no 0.03 -0.01 0.07 1.37
lag_marital_status_2widow -0.14 -0.23 -0.04 -2.66
lag_marital_status_2divorce -0.24 -0.36 -0.13 -4.14
lag_income_1 0.01 -0.00 0.01 1.33
------------------------------------------------------------------
RANDOM EFFECTS:
------------------------------------
Group Parameter Std. Dev.
---------- ------------- -----------
pidp (Intercept) 0.73
Residual 0.51
------------------------------------
Grouping variables:
-------------------------
Group # groups ICC
------- ---------- ------
pidp 565 0.68
-------------------------
# Model gm6: random-intercept model of political interest (PI) swapping the
# twice-lagged retirement indicator for the once-lagged one (lag_retired_1).
gm6 <- lmer(
  PI ~ age + lag_retired_1 + lag_health_2 + lag_marital_status_2 +
    lag_income_2 + (1 | pidp),
  data = data,
  na.action = na.exclude
)
# Suppress the fixed-effects correlation matrix in the printout
summary(gm6, correlation = FALSE)
Linear mixed model fit by REML ['lmerMod']
Formula: PI ~ age + lag_retired_1 + lag_health_2 + lag_marital_status_2 +
lag_income_2 + (1 | pidp)
Data: data
REML criterion at convergence: 9231.6
Scaled residuals:
Min 1Q Median 3Q Max
-4.3729 -0.4719 0.0426 0.5161 4.6952
Random effects:
Groups Name Variance Std.Dev.
pidp (Intercept) 0.5343 0.7310
Residual 0.2564 0.5064
Number of obs: 5085, groups: pidp, 565
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.709030 0.169441 10.086
age 0.014032 0.002352 5.966
lag_retired_1yes -0.014479 0.064270 -0.225
lag_health_2no 0.027154 0.019628 1.383
lag_marital_status_2widow -0.135105 0.050826 -2.658
lag_marital_status_2divorce -0.244088 0.058993 -4.138
lag_income_2 0.005643 0.003419 1.650
# Tabulate gm6 fixed effects with 95% confidence intervals (jtools::summ).
# Use TRUE rather than T: T is an ordinary variable and can be reassigned.
summ(gm6, confint = TRUE)
MODEL INFO:
Observations: 6215
Dependent Variable: PI
Type: Mixed effects linear regression
MODEL FIT:
AIC = 9249.65, BIC = 9308.46
Pseudo-R² (fixed effects) = 0.01
Pseudo-R² (total) = 0.68
FIXED EFFECTS:
------------------------------------------------------------------
Est. 2.5% 97.5% t val.
--------------------------------- ------- ------- ------- --------
(Intercept) 1.71 1.38 2.04 10.09
age 0.01 0.01 0.02 5.97
lag_retired_1yes -0.01 -0.14 0.11 -0.23
lag_health_2no 0.03 -0.01 0.07 1.38
lag_marital_status_2widow -0.14 -0.23 -0.04 -2.66
lag_marital_status_2divorce -0.24 -0.36 -0.13 -4.14
lag_income_2 0.01 -0.00 0.01 1.65
------------------------------------------------------------------
RANDOM EFFECTS:
------------------------------------
Group Parameter Std. Dev.
---------- ------------- -----------
pidp (Intercept) 0.73
Residual 0.51
------------------------------------
Grouping variables:
-------------------------
Group # groups ICC
------- ---------- ------
pidp 565 0.68
-------------------------
# Side-by-side comparison table of all fitted models:
# pooled OLS variants (m0-m3) and multilevel models (gm1-gm6)
stargazer(m0, m1, m2, m3, gm1, gm2, gm3, gm4, gm5, gm6, type = "text")
=========================================================================================================================================
Dependent variable:
-------------------------------------------------------------------------------------------------------------
PI
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10)
-----------------------------------------------------------------------------------------------------------------------------------------
age 0.012*** 0.012*** 0.174*** 0.013*** 0.016*** 0.014*** 0.014*** 0.014*** 0.014***
(0.002) (0.002) (0.041) (0.002) (0.002) (0.002) (0.002) (0.002) (0.002)
I(age2) -0.001***
(0.0003)
retiredyes -0.010
(0.063)
healthyes -0.004
(0.017)
marital_statuswidow -0.128***
(0.043)
marital_statusdivorce -0.050
(0.054)
log_income 0.002 0.006
(0.003) (0.004)
lag_retired_1yes -0.008 -0.014
(0.063) (0.064)
lag_health_1no 0.015
(0.019)
lag_marital_status_1widow -0.156***
(0.047)
lag_marital_status_1divorce -0.123**
(0.056)
lag_income_1 -0.001 0.005
(0.003) (0.004)
lag_retired_2yes -0.025 -0.026 -0.026
(0.063) (0.063) (0.063)
lag_health_2no 0.027 0.027 0.027 0.027
(0.020) (0.020) (0.020) (0.020)
lag_marital_status_2widow -0.135*** -0.135*** -0.135*** -0.135***
(0.051) (0.051) (0.051) (0.051)
lag_marital_status_2divorce -0.244*** -0.244*** -0.244*** -0.244***
(0.059) (0.059) (0.059) (0.059)
lag_income_2 0.006* 0.006*
(0.003) (0.003)
Constant 2.682*** 1.847*** 1.844*** -3.757*** 1.811*** 1.630*** 1.713*** 1.686*** 1.699*** 1.709***
(0.032) (0.128) (0.163) (1.426) (0.134) (0.150) (0.169) (0.171) (0.171) (0.169)
-----------------------------------------------------------------------------------------------------------------------------------------
Observations 6,215 6,215 6,215 6,215 6,215 6,215 6,215 6,215 6,215 6,215
Log Likelihood -5,550.904 -5,533.646 -5,459.713 -5,533.075 -5,543.179 -5,121.148 -4,615.788 -4,615.896 -4,616.133 -4,615.825
Akaike Inf. Crit. 11,107.810 11,075.290 10,931.430 11,076.150 11,104.360 10,260.300 9,249.576 9,249.792 9,250.267 9,249.649
Bayesian Inf. Crit. 11,128.010 11,102.230 10,971.830 11,109.820 11,164.970 10,320.050 9,308.382 9,308.598 9,309.073 9,308.456
=========================================================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
Overall, the 7th model gm3 is the best fitting model among all multilevel models.
# Visualising the standardised coefficients of the gm3 model
plot_model(gm3, type = "std") +
  theme_bw()

## Visualising the random effects of the gm3 model
# Each row of the figure represents an older person; the y-axis labels and
# ticks are hidden because individual identifiers are not informative.
plot_model(gm3, type = "re") +
  theme_bw() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# Marginal predicted values of political interest for each predictor in gm3
plot_model(gm3, type = "pred")
$age
$lag_retired_2
$lag_health_2
$lag_marital_status_2
$lag_income_2